Table. eGene Types


In [1]:
import copy
import cPickle
import os
import subprocess

import cdpybio as cpb
import matplotlib as mpl
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pybedtools as pbt
import scipy.stats as stats
import seaborn as sns

import ciepy
import cardipspy as cpy

%matplotlib inline

dy_name = 'table_egene_types_and_gwas'
    
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)

private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)

import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
    dy = os.path.join(ciepy.root, 'sandbox', 'tmp', dy_name)
    cpy.makedir(dy)
    pbt.set_tempdir(dy)

In [2]:
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)

In [3]:
fn = os.path.join(ciepy.root, 'output', 'gwas_analysis', 'pe_no_hla_grasp_counts.tsv')
grasp_counts = pd.read_table(fn, index_col=0)

fn = os.path.join(ciepy.root, 'output', 'gwas_analysis', 'grasp_results.tsv')
grasp_res = pd.read_table(fn, index_col=0)
#grasp_res['phenotype'] = grasp_res.phenotype.apply(lambda x: x.split(' (')[0])

grasp_counts.index = grasp_res.ix[grasp_counts.index, 'phenotype']
grasp_res.index = grasp_res.phenotype
grasp_res = grasp_res.ix[[x for x in grasp_res.index if 'xpression' not in x]]

t = grasp_res[['lead_odds', 'pe_odds', 'pe_no_hla_odds']]
t['odds_sum'] = t.sum(axis=1)
t.sort_values(by='odds_sum', inplace=True, ascending=False)
t = t.drop('odds_sum', axis=1)

grasp_res = grasp_res.ix[t.index]

grasp_res.columns = [x.replace('pe_', 'peqtn_') for x in grasp_res.columns]
grasp_res = grasp_res[[x for x in grasp_res.columns if 'bh' not in x]].drop('phenotype', axis=1)
grasp_res = grasp_res[[x for x in grasp_res.columns if 'peqtn' not in x]]
grasp_res.columns = [x.replace('lead_', '') for x in grasp_res.columns]

grasp_res = grasp_res.sort_values(by='pvalue')

In [4]:
dy = os.path.join(ciepy.root, 'output/eqtl_processing/eqtls01')
fn = os.path.join(dy, 'qvalues.tsv')
qvalues = pd.read_table(fn, index_col=0)

In [5]:
sig = qvalues[qvalues.perm_sig]

In [6]:
a = gene_info.ix[sig.index, 'gene_type'].value_counts()
b = gene_info.ix[qvalues.index, 'gene_type'].value_counts()
t = pd.concat([a, b], axis=1)
t = t.fillna(0)
t.columns = ['Significant', 'Tested']
t = t[['Tested', 'Significant']]
t.sort_values(by=['Tested', 'Significant'], inplace=True, ascending=False)
t['Percent significant'] = (t.Significant / t.Tested).round(3) * 100
t['Percent eGenes'] = (t.Significant / t.Significant.sum()).round(3) * 100
t.index = [x.replace('_', ' ') for x in t.index]

In [7]:
t


Out[7]:
Tested Significant Percent significant Percent eGenes
protein coding 13837 4622 33.4 80.4
pseudogene 1393 334 24.0 5.8
lincRNA 1049 362 34.5 6.3
antisense 926 269 29.0 4.7
processed transcript 223 92 41.3 1.6
sense intronic 133 29 21.8 0.5
snoRNA 69 4 5.8 0.1
misc RNA 59 5 8.5 0.1
sense overlapping 54 16 29.6 0.3
snRNA 30 0 0.0 0.0
polymorphic pseudogene 15 8 53.3 0.1
3prime overlapping ncrna 7 1 14.3 0.0
miRNA 3 1 33.3 0.0
TR C gene 2 2 100.0 0.0
IG V gene 2 1 50.0 0.0
TR V gene 2 0 0.0 0.0
IG V pseudogene 1 0 0.0 0.0

In [8]:
writer = pd.ExcelWriter(os.path.join(outdir, 'egene_types_gwas.xlsx'))
t.to_excel(writer, sheet_name='eGene summary')
grasp_res.to_excel(writer, sheet_name='GWAS enrichments')
writer.save()